In [10]:
from Bio import AlignIO
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Graphics import GenomeDiagram
from reportlab.lib import colors
from reportlab.lib.units import cm
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import sys, os
import itertools
%matplotlib inline
################### Function Definitions ######################
def get_diff_locations(ref,compare):
"""
Takes 2 Biopython Seq objects of equal size (aligned).
Returns a list of locations at which base in compare sequence != base in ref sequence.
"""
if len(ref) != len(compare):
raise ValueError('Seqs are not the same length!')
diff_locations = []
for n,(r,c) in enumerate(itertools.izip(ref, compare)):
if r != c:
diff_locations.append(n)
return diff_locations
def plot_tracks(tracks,tracks_path):
'''
Save tracks as pngs, then load them as images and display with matplotlib.
'''
#Write each diagram to a png, then read it back in using Matplotlib
tracks.draw(format='linear', pagesize=(14*cm,7*cm), fragments=1,
start=0, end=13378)
plt.show()
#get extension from tracks path
root, ext = os.path.splitext(tracks_path)
if not ext:
ext = '.png'
tracks_path += ext
tracks.write(tracks_path, ext.strip('.'), dpi=600)
tracks_im = mpimg.imread(tracks_path)
#Plot each set of tracks
fig = plt.figure(figsize=(14,7),dpi=600)
ax = fig.add_axes([0.025,0.025,0.95,0.95],frameon=False)
plt.axis('off')
plt.imshow(tracks_im)
################### End Function Definitions ######################
In [11]:
#Import and sort the aligned sequences
all_aln = AlignIO.read('all_seq.aln','clustal')
all_aln.sort()
In [12]:
#collect ids and separate based on host (letter prefex preceding the year)
all_ids = [i.id for i in all_aln]
h_ids = all_ids[0:4]
s_ids = all_ids[4:]
print all_ids,'\n', h_ids, s_ids
Compare sequence from each year against the same-host sequence from 1935. Each time a difference is found, save the index at which it occurs.
In [13]:
#Compare each year to 1935
h_diffs = {}
s_diffs = {}
h_diffs['h1978'] = get_diff_locations(all_aln[0].seq,all_aln[1].seq)
h_diffs['h2009'] = get_diff_locations(all_aln[0].seq,all_aln[2].seq)
h_diffs['h2014'] = get_diff_locations(all_aln[0].seq,all_aln[3].seq)
s_diffs['s1978'] = get_diff_locations(all_aln[4].seq,all_aln[5].seq)
s_diffs['s2009'] = get_diff_locations(all_aln[4].seq,all_aln[6].seq)
s_diffs['s2014'] = get_diff_locations(all_aln[4].seq,all_aln[7].seq)
In [5]:
def setup(diagram_name, ids, length):
'''
Inits empty track diagram (with diagram_name as title), tracks, and feature sets for all ids.
Adds bright green bg to each track (for contrast).
Returns tracks, features, and feature_sets.
'''
#Create Genome Diagram for human
track_diagram = GenomeDiagram.Diagram(diagram_name+' H1N1 Tracks Plot', tracklines = 0)
track = {}
feature_set = {}
#Generate tracks and feature sets for each year
track_count = 1
for i in ids:
track['%s' %i] = track_diagram.new_track(track_count, name=i, greytrack=False)#greytrack=True,
#greytrack_labels=1, greytrack_fontcolor = colors.cornsilk,
#greytrack_fontsize=15)
feature_set['%s'%i] = track['%s' %i].new_set(name=i)
#add contrast background
feature_set['%s'%i].add_feature(SeqFeature(FeatureLocation(0,length)),
label=True, label_angle=0, color=colors.cadetblue)
track_count += 1
return track_diagram, track, feature_set
h_diagram, h_features, h_feature_set = setup(diagram_name='Human', ids = h_ids, length=all_aln.get_alignment_length())
s_diagram, s_features, s_feature_set = setup(diagram_name='Swine', ids = s_ids, length=all_aln.get_alignment_length())
In [6]:
def add_features_year_compare(feature_set, diffs, hostspecies):
for k, features in feature_set.items():
year = k[1:]
if '1935' in k:
#if the base year...
label = '%s: %s (reference)'%(hostspecies,year)
else:
#name bg feature (which labels the track)
label ='%s: %s --- %i differences vs. 1935'%(hostspecies, year, len(diffs[k]))
# For each difference recorded in diffs, add as a feature to feature_set
for i in diffs[k]:
feature = SeqFeature(FeatureLocation(i,i))
features.add_feature(feature, name=label, color=colors.aqua)
# set the name of the first feature (the background feature) to label
# this is stupid, but I'm not sure how to do it better
features.get_features()[0].name = label
add_features_year_compare(feature_set = h_feature_set, diffs = h_diffs, hostspecies = 'Human')
add_features_year_compare(feature_set = s_feature_set, diffs = s_diffs, hostspecies = 'Swine')
plot_tracks(h_diagram,'h_tracks.png')
plot_tracks(s_diagram, 's_tracks.png')
In [56]:
diffs = {}
diffs['1935'] = get_diff_locations(all_aln[0].seq,all_aln[4].seq)
diffs['1978'] = get_diff_locations(all_aln[1].seq,all_aln[5].seq)
diffs['2009'] = get_diff_locations(all_aln[2].seq,all_aln[6].seq)
diffs['2014'] = get_diff_locations(all_aln[3].seq,all_aln[7].seq)
In [58]:
def compare_strains_by_host(diffs):
#Create Genome Diagram to compare human and swine in each year
diagram = GenomeDiagram.Diagram('Human vs. Swine H1N1 Tracks Plot')
tracks = {}
feature_set = {}
for n,(k,v) in enumerate(diffs.iteritems(),1):
tracks[k] = diagram.new_track(n, greytrack=False)
feature_set[k] = tracks[k].new_set()
return diagram, tracks, feature_set
def add_features_host_compare(feature_set):
for k,v in feature_set.iteritems():
feature_set[k].add_feature(SeqFeature(FeatureLocation(0,13378)),
name = 'Human vs. Swine: %s --- %i differences'%(k,len(diffs[k])),
label=True, label_angle=0, color=colors.cadetblue);
for i in diffs[k]:
feature = SeqFeature(FeatureLocation(i,i))
feature_set[k].add_feature(feature, color=colors.aqua)
diagram, tracks, feature_set = compare_strains_by_host(diffs)
add_features_host_compare(feature_set)
feature_set
plot_tracks(diagram, 'host_compare.png')
In [27]:
#Import the aligned sequences
h_aln = AlignIO.read('all_human.aln','clustal')
h_aln.sort()
print h_aln
s_aln = AlignIO.read('all_swine.aln','clustal')
s_aln.sort()
print s_aln
#get_diff_locations
In [ ]:
In [ ]:
for i,v in h_tracks.tracks.items():
print v.get_sets()
print [i.name for i in h_tracks.tracks.values()]
track_diagram = GenomeDiagram.Diagram("")
track = track_diagram.new_track(1, greytrack=False)
print track.get_sets()
feature_set= track.new_set(name=i)
feature_set.get_features()